# Final Code

# Silence library warnings for cleaner notebook output.
# NOTE(review): blanket suppression also hides useful deprecation warnings.
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
# Load the raw loan-default dataset (one row per loan, 255,347 rows x 18 columns).
df = pd.read_csv('Loan_default.csv')
# Display the frame (notebook cell output).
df
LoanID Age Income LoanAmount CreditScore MonthsEmployed NumCreditLines InterestRate LoanTerm DTIRatio Education EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose HasCoSigner Default
0 I38PQUQS96 56 85994 50587 520 80 4 15.23 36 0.44 Bachelor's Full-time Divorced Yes Yes Other Yes 0
1 HPSK72WA7R 69 50432 124440 458 15 1 4.81 60 0.68 Master's Full-time Married No No Other Yes 0
2 C1OZ6DPJ8Y 46 84208 129188 451 26 3 21.17 24 0.31 Master's Unemployed Divorced Yes Yes Auto No 1
3 V2KKSFM3UN 32 31713 44799 743 0 3 7.07 24 0.23 High School Full-time Married No No Business No 0
4 EY08JDHTZP 60 20437 9139 633 8 4 6.51 48 0.73 Bachelor's Unemployed Divorced No Yes Auto No 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
255342 8C6S86ESGC 19 37979 210682 541 109 4 14.11 12 0.85 Bachelor's Full-time Married No No Other No 0
255343 98R4KDHNND 32 51953 189899 511 14 2 11.55 24 0.21 High School Part-time Divorced No No Home No 1
255344 XQK1UUUNGP 56 84820 208294 597 70 3 5.29 60 0.50 High School Self-employed Married Yes Yes Auto Yes 0
255345 JAO28CPL4H 42 85109 60575 809 40 1 20.90 48 0.44 High School Part-time Single Yes Yes Other No 0
255346 ZTH91CGL0B 62 22418 18481 636 113 2 6.73 12 0.48 Bachelor's Unemployed Divorced Yes No Education Yes 0

255347 rows × 18 columns

# Open (or create) the SQLite database and drop the normalized tables so the
# script can be re-run from a clean slate.
conn = sqlite3.connect('loan_default.db')
cur = conn.cursor()
for table_name in ("Clients", "Loans", "Employment", "CreditDetails"):
    cur.execute(f"DROP TABLE IF EXISTS {table_name};")
<sqlite3.Cursor at 0x11fe9b43dc0>
# Generate a surrogate ClientID: rows sharing all client-specific attributes
# are treated as the same client (ngroup assigns one integer per group).
df['ClientID'] = df.groupby(['Age', 'Income', 'CreditScore', 'EmploymentType',
                             'Education', 'MaritalStatus', 'HasDependents', 'HasMortgage']).ngroup()
create_clients_table = """
CREATE TABLE IF NOT EXISTS Clients (
    ClientID INTEGER PRIMARY KEY,
    Age INTEGER,
    Income REAL,
    CreditScore INTEGER,
    EmploymentType TEXT,
    Education TEXT,
    MaritalStatus TEXT,
    HasDependents TEXT,
    HasMortgage TEXT
);
"""
create_loans_table = """
CREATE TABLE IF NOT EXISTS Loans (
    LoanID TEXT PRIMARY KEY,
    ClientID INTEGER,
    LoanAmount REAL,
    InterestRate REAL,
    LoanTerm INTEGER,
    DTIRatio REAL,
    LoanPurpose TEXT,
    HasCoSigner TEXT,
    [Default] INTEGER,
    FOREIGN KEY(ClientID) REFERENCES Clients(ClientID)
);
"""
# FIX: the original Employment schema (EmploymentType PRIMARY KEY, MonthsEmployed)
# did not match the data inserted into it later (ClientID, EmploymentType,
# MonthsEmployed) nor the join on Employment.ClientID; it only "worked" because
# to_sql(if_exists='replace') silently rebuilt the table from the frame.
create_employment_table = """
CREATE TABLE IF NOT EXISTS Employment (
    ClientID INTEGER,
    EmploymentType TEXT,
    MonthsEmployed INTEGER,
    FOREIGN KEY(ClientID) REFERENCES Clients(ClientID)
);
"""
create_credit_details_table = """
CREATE TABLE IF NOT EXISTS CreditDetails (
    LoanID TEXT PRIMARY KEY,
    NumCreditLines INTEGER,
    FOREIGN KEY(LoanID) REFERENCES Loans(LoanID)
);
"""
for table_sql in [create_clients_table, create_loans_table, create_employment_table, create_credit_details_table]:
    cur.execute(table_sql)
conn.commit()
# One row per distinct client (ClientID is a function of the grouped columns,
# so drop_duplicates cannot leave duplicate ClientIDs behind).
clients_data = df[['ClientID', 'Age', 'Income', 'CreditScore', 'EmploymentType',
                   'Education', 'MaritalStatus', 'HasDependents', 'HasMortgage']].drop_duplicates()
# FIX: use if_exists='append' so the schema declared above (PRIMARY KEY, column
# types) is kept; 'replace' drops and recreates the table without constraints.
clients_data.to_sql('Clients', conn, if_exists='append', index=False)
255347
# Loan-level facts: one row per LoanID, keyed back to Clients via ClientID.
loans_data = df[['LoanID', 'ClientID', 'LoanAmount', 'InterestRate', 'LoanTerm',
                 'DTIRatio', 'LoanPurpose', 'HasCoSigner', 'Default']]
# FIX: 'append' keeps the Loans schema created above (PRIMARY KEY, FOREIGN KEY);
# 'replace' would rebuild the table without those constraints.
loans_data.to_sql('Loans', conn, if_exists='append', index=False)
255347
# Employment facts per client (unique ClientID/EmploymentType/MonthsEmployed
# combinations after drop_duplicates).
employment_data = df[['ClientID', 'EmploymentType', 'MonthsEmployed']].drop_duplicates()
# NOTE(review): if_exists='replace' drops the Employment table declared above
# and recreates it from the frame, so the declared schema/keys are discarded.
employment_data.to_sql('Employment', conn, if_exists='replace', index=False)
255347
# Credit-line count per loan (unique LoanID/NumCreditLines pairs).
credit_details_data = df[['LoanID', 'NumCreditLines']].drop_duplicates()
# NOTE(review): 'replace' rebuilds the table and discards the PRIMARY KEY /
# FOREIGN KEY declared in create_credit_details_table above.
credit_details_data.to_sql('CreditDetails', conn, if_exists='replace', index=False)
255347
# Spot-check the first rows of the Clients table.
print("Clients Table:")
print(pd.read_sql("SELECT * FROM Clients LIMIT 5", conn))
Clients Table:
   ClientID  Age  Income  CreditScore EmploymentType    Education  \
0    189386   56   85994          520      Full-time   Bachelor's   
1    251796   69   50432          458      Full-time     Master's   
2    140114   46   84208          451     Unemployed     Master's   
3     69067   32   31713          743      Full-time  High School   
4    206578   60   20437          633     Unemployed   Bachelor's   

  MaritalStatus HasDependents HasMortgage  
0      Divorced           Yes         Yes  
1       Married            No          No  
2      Divorced           Yes         Yes  
3       Married            No          No  
4      Divorced           Yes          No  
# Spot-check the first rows of the Loans table.
print("\nLoans Table:")
print(pd.read_sql("SELECT * FROM Loans LIMIT 5", conn))
Loans Table:
       LoanID  ClientID  LoanAmount  InterestRate  LoanTerm  DTIRatio  \
0  I38PQUQS96    189386       50587         15.23        36      0.44   
1  HPSK72WA7R    251796      124440          4.81        60      0.68   
2  C1OZ6DPJ8Y    140114      129188         21.17        24      0.31   
3  V2KKSFM3UN     69067       44799          7.07        24      0.23   
4  EY08JDHTZP    206578        9139          6.51        48      0.73   

  LoanPurpose HasCoSigner  Default  
0       Other         Yes        0  
1       Other         Yes        0  
2        Auto          No        1  
3    Business          No        0  
4        Auto          No        0  
# Spot-check the first rows of the Employment table.
print("\nEmployment Table:")
print(pd.read_sql("SELECT * FROM Employment LIMIT 5", conn))
Employment Table:
   ClientID EmploymentType  MonthsEmployed
0    189386      Full-time              80
1    251796      Full-time              15
2    140114     Unemployed              26
3     69067      Full-time               0
4    206578     Unemployed               8
# Spot-check the first rows of the CreditDetails table.
print("\nCreditDetails Table:")
print(pd.read_sql("SELECT * FROM CreditDetails LIMIT 5", conn))
CreditDetails Table:
       LoanID  NumCreditLines
0  I38PQUQS96               4
1  HPSK72WA7R               1
2  C1OZ6DPJ8Y               3
3  V2KKSFM3UN               3
4  EY08JDHTZP               4
# Reassemble the denormalized view by joining the four tables back together;
# DISTINCT guards against duplicate rows introduced by the joins.
query = """
SELECT DISTINCT
    Loans.LoanID,
    Clients.Age,
    Clients.Income,
    Loans.LoanAmount,
    Clients.CreditScore,
    Employment.MonthsEmployed,
    CreditDetails.NumCreditLines,
    Loans.InterestRate,
    Loans.LoanTerm,
    Loans.DTIRatio,
    Clients.Education,
    Clients.EmploymentType,
    Clients.MaritalStatus,
    Clients.HasMortgage,
    Clients.HasDependents,
    Loans.LoanPurpose,
    Loans.HasCoSigner,
    Loans.[Default]
FROM Loans
JOIN Clients ON Loans.ClientID = Clients.ClientID
JOIN Employment ON Clients.ClientID = Employment.ClientID -- Match ClientID for accuracy
JOIN CreditDetails ON Loans.LoanID = CreditDetails.LoanID;
"""
# Materialize the joined result (255,347 rows per the cell output below).
combined_data = pd.read_sql(query, conn)
print(combined_data)

# Done with the database; everything from here on uses the in-memory frame.
conn.close()
            LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0       I38PQUQS96   56   85994       50587          520              80   
1       HPSK72WA7R   69   50432      124440          458              15   
2       C1OZ6DPJ8Y   46   84208      129188          451              26   
3       V2KKSFM3UN   32   31713       44799          743               0   
4       EY08JDHTZP   60   20437        9139          633               8   
...            ...  ...     ...         ...          ...             ...   
255342  8C6S86ESGC   19   37979      210682          541             109   
255343  98R4KDHNND   32   51953      189899          511              14   
255344  XQK1UUUNGP   56   84820      208294          597              70   
255345  JAO28CPL4H   42   85109       60575          809              40   
255346  ZTH91CGL0B   62   22418       18481          636             113   

        NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0                    4         15.23        36      0.44   Bachelor's   
1                    1          4.81        60      0.68     Master's   
2                    3         21.17        24      0.31     Master's   
3                    3          7.07        24      0.23  High School   
4                    4          6.51        48      0.73   Bachelor's   
...                ...           ...       ...       ...          ...   
255342               4         14.11        12      0.85   Bachelor's   
255343               2         11.55        24      0.21  High School   
255344               3          5.29        60      0.50  High School   
255345               1         20.90        48      0.44  High School   
255346               2          6.73        12      0.48   Bachelor's   

       EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0           Full-time      Divorced         Yes           Yes       Other   
1           Full-time       Married          No            No       Other   
2          Unemployed      Divorced         Yes           Yes        Auto   
3           Full-time       Married          No            No    Business   
4          Unemployed      Divorced          No           Yes        Auto   
...               ...           ...         ...           ...         ...   
255342      Full-time       Married          No            No       Other   
255343      Part-time      Divorced          No            No        Home   
255344  Self-employed       Married         Yes           Yes        Auto   
255345      Part-time        Single         Yes           Yes       Other   
255346     Unemployed      Divorced         Yes            No   Education   

       HasCoSigner  Default  
0              Yes        0  
1              Yes        0  
2               No        1  
3               No        0  
4               No        0  
...            ...      ...  
255342          No        0  
255343          No        1  
255344         Yes        0  
255345          No        0  
255346         Yes        0  

[255347 rows x 18 columns]
# Work with the re-joined table from here on.
df = combined_data
# Class balance of the target (~88% no-default vs ~12% default).
default_counts = df['Default'].value_counts(normalize=True)  # Proportions
print("Class distribution for 'Default':\n", default_counts)
Class distribution for 'Default':
 Default
0    0.883872
1    0.116128
Name: proportion, dtype: float64
# Bar chart of the class proportions computed above.
plt.bar(default_counts.index, default_counts.values, tick_label=['No Default', 'Default'])
plt.title('Class Distribution of Default')
plt.xlabel('Default')
plt.ylabel('Proportion')
plt.show()

png

# Per-category proportions for the main categorical features, printed and
# plotted one at a time (all turn out close to uniform per the output).
categorical_columns = ['LoanPurpose', 'EmploymentType', 'MaritalStatus']
for col in categorical_columns:
    print(f"\nDistribution for {col}:\n", df[col].value_counts(normalize=True))
    df[col].value_counts(normalize=True).plot(kind='bar', title=f'{col} Distribution')
    plt.show()
Distribution for LoanPurpose:
 LoanPurpose
Business     0.200895
Home         0.200848
Education    0.199748
Other        0.199391
Auto         0.199117
Name: proportion, dtype: float64

png

Distribution for EmploymentType:
 EmploymentType
Part-time        0.251270
Unemployed       0.249950
Self-employed    0.249488
Full-time        0.249292
Name: proportion, dtype: float64

png

Distribution for MaritalStatus:
 MaritalStatus
Married     0.334063
Divorced    0.333010
Single      0.332927
Name: proportion, dtype: float64

png

# Pearson correlation of every numeric column with the Default target,
# ordered strongest-positive first.
numerical_data = df.select_dtypes(include=['float64', 'int64'])

correlation = (
    numerical_data
    .corr()['Default']
    .sort_values(ascending=False)
)
print("\nCorrelation with 'Default':\n", correlation)
Correlation with 'Default':
 Default           1.000000
InterestRate      0.131273
LoanAmount        0.086659
NumCreditLines    0.028330
DTIRatio          0.019236
LoanTerm          0.000545
CreditScore      -0.034166
MonthsEmployed   -0.097374
Income           -0.099119
Age              -0.167783
Name: Default, dtype: float64
from sklearn.model_selection import train_test_split
# Features/target split: drop the target and the LoanID identifier
# (an ID carries no predictive signal and would leak uniqueness).
X = df.drop(columns=['Default', 'LoanID'])
y = df['Default']
# Display X (notebook cell output).
X
Age Income LoanAmount CreditScore MonthsEmployed NumCreditLines InterestRate LoanTerm DTIRatio Education EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose HasCoSigner
0 56 85994 50587 520 80 4 15.23 36 0.44 Bachelor's Full-time Divorced Yes Yes Other Yes
1 69 50432 124440 458 15 1 4.81 60 0.68 Master's Full-time Married No No Other Yes
2 46 84208 129188 451 26 3 21.17 24 0.31 Master's Unemployed Divorced Yes Yes Auto No
3 32 31713 44799 743 0 3 7.07 24 0.23 High School Full-time Married No No Business No
4 60 20437 9139 633 8 4 6.51 48 0.73 Bachelor's Unemployed Divorced No Yes Auto No
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
255342 19 37979 210682 541 109 4 14.11 12 0.85 Bachelor's Full-time Married No No Other No
255343 32 51953 189899 511 14 2 11.55 24 0.21 High School Part-time Divorced No No Home No
255344 56 84820 208294 597 70 3 5.29 60 0.50 High School Self-employed Married Yes Yes Auto Yes
255345 42 85109 60575 809 40 1 20.90 48 0.44 High School Part-time Single Yes Yes Other No
255346 62 22418 18481 636 113 2 6.73 12 0.48 Bachelor's Unemployed Divorced Yes No Education Yes

255347 rows × 16 columns

y
0         0
1         0
2         1
3         0
4         0
         ..
255342    0
255343    1
255344    0
255345    0
255346    0
Name: Default, Length: 255347, dtype: int64
# Stratified 80/20 train-test split: stratify=y keeps the ~88/12 class
# ratio identical in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Train set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])
Train set size: 204277
Test set size: 51070
# Confirm stratification preserved the class ratio in both splits.
print("\nTrain 'Default' distribution:\n", y_train.value_counts(normalize=True))
print("\nTest 'Default' distribution:\n", y_test.value_counts(normalize=True))
Train 'Default' distribution:
 Default
0    0.883873
1    0.116127
Name: proportion, dtype: float64

Test 'Default' distribution:
 Default
0    0.883865
1    0.116135
Name: proportion, dtype: float64
# Recombine features and target so the profiling report covers both.
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)
from ydata_profiling import ProfileReport
# HTML EDA report for the training split only (keeps test-set statistics
# out of the exploration).
profile = ProfileReport(train_data, title="Train Data Profiling Report", explorative=True)
profile.to_file("train_data_profile_report.html")
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]



Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]



Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]
profile.to_notebook_iframe()

# Experiment-tracking and modeling imports.
import dagshub
import mlflow
from mlflow.models import infer_signature
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, confusion_matrix
import mlflow.sklearn
import logging
# Silence MLflow's chatty INFO logs; only errors surface.
logging.getLogger("mlflow").setLevel(logging.ERROR)
# Preprocessing for numerical and categorical data.
numerical_features = ['Income', 'LoanAmount', 'CreditScore', 'InterestRate', 'DTIRatio']
categorical_features = ['Education', 'EmploymentType', 'LoanPurpose', 'MaritalStatus']

# Numeric: median-impute missing values, then standardize.
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)

# Categorical: mode-impute, then one-hot encode (categories unseen at fit
# time are encoded as all-zeros instead of raising).
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

preprocessor = ColumnTransformer([
    ("num", numerical_pipeline, numerical_features),
    ("cat", categorical_pipeline, categorical_features)
])

# Candidate models; the ~88/12 class imbalance is countered with
# class_weight="balanced" (sklearn) and scale_pos_weight = neg/pos (XGBoost).
models = {
    "LogisticRegression": LogisticRegression(class_weight="balanced", random_state=42),
    "RidgeClassifier": RidgeClassifier(class_weight="balanced", random_state=42),
    "RandomForestClassifier": RandomForestClassifier(class_weight="balanced", random_state=42),
    "XGBClassifier": XGBClassifier(scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]), random_state=42)
}

# Initialize MLflow tracking.
# SECURITY FIX: a username and access token were hard-coded here, exposing the
# credential in version control (revoke that token). Credentials must be
# supplied via the environment before running:
#   export MLFLOW_TRACKING_USERNAME=... MLFLOW_TRACKING_PASSWORD=...
if not os.environ.get("MLFLOW_TRACKING_USERNAME") or not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    raise RuntimeError(
        "Set MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD in the environment."
    )
mlflow.set_tracking_uri("https://dagshub.com/Shruti-1205/my-first-repo.mlflow")
mlflow.set_experiment("Classifier Comparison Experiment")

# Experiment #1: train each candidate model inside a preprocess+classify
# pipeline, evaluate on the held-out test set, and log everything to MLflow.
f1_scores = {}
for model_name, model in models.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])
    pipeline.fit(X_train, y_train)

    # Evaluation on the held-out test split.
    y_pred = pipeline.predict(X_test)
    val_f1 = f1_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Cross-validation for F1-score.
    # NOTE(review): this refits the pipeline 5 more times per model, which
    # dominates runtime on ~200k training rows.
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="f1")

    # Store F1-score for plotting.
    f1_scores[model_name] = val_f1

    # Logging to MLflow: one run per model.
    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("f1_score", val_f1)
        mlflow.log_metric("mean_cv_f1", cv_scores.mean())
        mlflow.log_metric("std_cv_f1", cv_scores.std())
        mlflow.log_metric("tn", tn)
        mlflow.log_metric("fp", fp)
        mlflow.log_metric("fn", fn)
        mlflow.log_metric("tp", tp)

        # Save per-row predictions as a CSV artifact.
        predictions_df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
        predictions_file = f"{model_name}_predictions.csv"
        predictions_df.to_csv(predictions_file, index=False)
        mlflow.log_artifact(predictions_file)

        # Log the fitted pipeline with an inferred input/output signature.
        signature = infer_signature(X_train, pipeline.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="models",
            signature=signature,
            input_example=X_train.iloc[:5]
        )

# F1-score comparison plot across all models.
import matplotlib.pyplot as plt
plt.bar(f1_scores.keys(), f1_scores.values())
plt.ylabel("F1-Score")
plt.title("Model F1-Score Comparison")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("f1_score_comparison.png")
#plt.show()
# NOTE(review): no run is active here, so MLflow opens a fresh auto-named run
# just to hold this artifact (visible as "thundering-lamb-855" in the output).
mlflow.log_artifact("f1_score_comparison.png")
๐Ÿƒ View run RandomForestClassifier at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1/runs/408523ea41994765a88707e63e2106ba
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1
๐Ÿƒ View run XGBClassifier at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1/runs/bfd3f21e14bc4de99158f70a8d6465d3
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1

png

mlflow.end_run()
๐Ÿƒ View run thundering-lamb-855 at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1/runs/3f8773b8aded4bd3b6120ac807cbf270
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1
from sklearn.preprocessing import FunctionTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
class FeatureEngineering(BaseEstimator, TransformerMixin):
    """Append two ratio features (LoanAmount/Income, CreditScore/Income)
    to an already-preprocessed feature matrix.

    Relies on the module-level ``numerical_features`` list to locate the
    source columns, so it assumes the numeric pipeline's output occupies
    the leading columns of X in that order.
    """

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        # Column positions of the inputs within the numeric block.
        idx_loan = numerical_features.index("LoanAmount")
        idx_income = numerical_features.index("Income")
        idx_score = numerical_features.index("CreditScore")

        # Small epsilon keeps the divisions finite for zero income.
        income = X[:, idx_income] + 1e-9
        ratios = [X[:, idx_loan] / income, X[:, idx_score] / income]

        # Return the original matrix with the two ratio columns appended.
        return np.column_stack([X] + ratios)
feature_engineering = FeatureEngineering()

# Fresh instances of the same four candidates as Experiment #1.
models = {
    "LogisticRegression": LogisticRegression(class_weight="balanced", random_state=42),
    "RidgeClassifier": RidgeClassifier(class_weight="balanced", random_state=42),
    "RandomForestClassifier": RandomForestClassifier(class_weight="balanced", random_state=42),
    "XGBClassifier": XGBClassifier(scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]), random_state=42)
}

# Experiment #3: preprocessing -> engineered ratio features -> SMOTE -> model.
# The imblearn pipeline applies SMOTE during fit only, never at predict time.
for model_name, model in models.items():
    pipeline = ImbPipeline(steps=[
        ("preprocessor", preprocessor),
        ("feature_engineering", feature_engineering),
        ("smote", SMOTE(random_state=42)),
        ("classifier", model)
    ])

    # Fit the pipeline
    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out test set.
    y_pred = pipeline.predict(X_test)
    val_f1 = f1_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Log results to MLflow.
    # FIX: the run name was an f-string with no placeholder, so all four runs
    # were logged under the identical name "Experiment #3 - Feature
    # Engineering" and could not be told apart in the MLflow UI (unlike
    # Experiments #4/#5, which include the model name).
    with mlflow.start_run(run_name=f"Experiment #3 - Feature Engineering - {model_name}"):
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("f1_score", val_f1)
        mlflow.log_metric("tn", tn)
        mlflow.log_metric("fp", fp)
        mlflow.log_metric("fn", fn)
        mlflow.log_metric("tp", tp)

        # Save model to MLflow
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="models",
            input_example=X_train.iloc[:5]
        )
๐Ÿƒ View run Experiment #3 - Feature Engineering at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1/runs/2847388707ac40fe8c9e68338f0dc506
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1
๐Ÿƒ View run Experiment #3 - Feature Engineering at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1/runs/2cc850a8a2d94dcb9ad439e37ac9fffe
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1
# Select only numerical columns for Random Forest.
X_train_numeric = X_train.select_dtypes(include=[np.number])

# Train Random Forest to get feature importances.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_numeric, y_train)

# Rank features by importance, highest first.
importances = rf.feature_importances_
importance_df = pd.DataFrame({
    "Feature": X_train_numeric.columns,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

# Select top features based on cumulative importance.
# NOTE(review): the <= 0.95 filter excludes the feature whose addition first
# crosses the 95% line; include the first index where cumsum >= 0.95 if that
# feature should be kept.
cumulative_importance = importance_df["Importance"].cumsum()
selected_features = importance_df[cumulative_importance <= 0.95]["Feature"]

# Restrict both splits to the selected features.
X_train_imp = X_train_numeric[selected_features]
X_test_imp = X_test[selected_features]

print(f"Selected features based on importance: {selected_features.tolist()}")
Selected features based on importance: ['Income', 'InterestRate', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'DTIRatio', 'Age']
from sklearn.feature_selection import VarianceThreshold

# Drop near-constant features.
# NOTE(review): variance is scale-dependent and these features are unscaled,
# so threshold=0.05 removes nothing here (all seven features survive, per
# the printed output).
vt = VarianceThreshold(threshold=0.05)
X_train_vt = vt.fit_transform(X_train_imp)
X_test_vt = vt.transform(X_test_imp)

# Get the names of the surviving features.
selected_variance_features = X_train_imp.columns[vt.get_support()]
print(f"Selected features after variance threshold: {selected_variance_features.tolist()}")
Selected features after variance threshold: ['Income', 'InterestRate', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'DTIRatio', 'Age']
# Experiment #4: SMOTE oversampling on the variance-filtered numeric features
# only (no scaling/encoding pipeline -- the inputs are already numeric).
# Models are the instances from the Experiment #3 dict; pipeline.fit refits them.

for model_name, model in models.items():
    pipeline = ImbPipeline(steps=[
        ("smote", SMOTE(random_state=42)),  # Balance classes with SMOTE
        ("classifier", model)
    ])
    pipeline.fit(X_train_vt, y_train)

    # Evaluate on the held-out test set.
    y_pred = pipeline.predict(X_test_vt)
    val_f1 = f1_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Log results to MLflow, one run per model.
    with mlflow.start_run(run_name=f"Experiment #4 - {model_name}"):
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("f1_score", val_f1)
        mlflow.log_metric("tn", tn)
        mlflow.log_metric("fp", fp)
        mlflow.log_metric("fn", fn)
        mlflow.log_metric("tp", tp)

        # Save model to MLflow
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="models",
            input_example=X_train_vt[:5]
        )

    print(f"{model_name} F1-Score: {val_f1}")
๐Ÿƒ View run Experiment #4 - LogisticRegression at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1/runs/80ba5737cf914b48aa0cbe1c2a5c8652
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1
LogisticRegression F1-Score: 0.27341632957624795
๐Ÿƒ View run Experiment #4 - RidgeClassifier at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1/runs/81921cf4336b49fc905091395736efd5
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1
RidgeClassifier F1-Score: 0.32513243084167154
๐Ÿƒ View run Experiment #4 - RandomForestClassifier at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1/runs/a39bc4c22ced4995827eaf03e21f1f47
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1
RandomForestClassifier F1-Score: 0.2874913614374568
๐Ÿƒ View run Experiment #4 - XGBClassifier at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1/runs/2ed127cbe5194671af91956c896bcb5d
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1
XGBClassifier F1-Score: 0.29512824926837666
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize before PCA: PCA maximizes variance, so unscaled features
# (e.g. Income) would otherwise dominate the components.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imp)
X_test_scaled = scaler.transform(X_test_imp)

# Fit PCA with all components to inspect the full variance spectrum.
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Scree plot of cumulative explained variance.
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--')
plt.title("Scree Plot")
plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.grid()
plt.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')
plt.legend()
plt.show()

# First component count whose cumulative variance reaches 95%
# (argmax returns the first True index; +1 converts index -> count).
n_components = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Number of components to retain 95% variance: {n_components}")

png

Number of components to retain 95% variance: 7
# Re-fit PCA with the chosen component count.
# NOTE(review): n_components came out as 7 here, i.e. all features are kept,
# so this "reduction" only rotates the data.
pca = PCA(n_components=n_components)
X_train_reduced = pca.fit_transform(X_train_scaled)
X_test_reduced = pca.transform(X_test_scaled)
# Downcast to float32 to halve memory before the hyperparameter search.
X_train_reduced = X_train_reduced.astype(np.float32)
X_test_reduced = X_test_reduced.astype(np.float32)
from sklearn.model_selection import RandomizedSearchCV
# Hyperparameter grids: small, discrete search spaces per model.
param_grids = {
    "XGBClassifier": {
        "classifier__max_depth": [3, 5],
        "classifier__learning_rate": [0.1, 0.2],
        "classifier__n_estimators": [50, 100]
    },
    "RandomForestClassifier": {
        "classifier__n_estimators": [100, 200],
        "classifier__max_depth": [10, None],
        "classifier__class_weight": ["balanced"]
    },
    "RidgeClassifier": {
        "classifier__alpha": [0.1, 1, 10]
    },
    "LogisticRegression": {
        "classifier__C": [0.1, 1, 10]
    }
}

# Experiment #5: randomized hyperparameter search over SMOTE + model on the
# PCA-reduced features.
best_f1_scores = {}
for model_name, model in models.items():
    param_grid = param_grids.get(model_name, {})
    pipeline = ImbPipeline(steps=[
        ("smote", SMOTE(random_state=42)),
        ("classifier", model)
    ])

    # NOTE(review): n_iter=10 exceeds some grids (RidgeClassifier and
    # LogisticRegression have only 3 combinations each) -- confirm the
    # installed sklearn version handles this by falling back to an
    # exhaustive search rather than raising.
    random_search = RandomizedSearchCV(pipeline, param_grid, n_iter=10, scoring="f1", cv=3, n_jobs=1, random_state=42)
    random_search.fit(X_train_reduced, y_train)

    # Evaluate the best estimator found by the search on the test set.
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test_reduced)

    # Test-set metrics for the tuned model.
    val_f1 = f1_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    best_f1_scores[model_name] = val_f1

    # Log results to MLflow, one run per model.
    with mlflow.start_run(run_name=f"Experiment #5 - {model_name}"):
        mlflow.log_params(random_search.best_params_)
        mlflow.log_metric("f1_score", val_f1)
        mlflow.log_metric("tn", tn)
        mlflow.log_metric("fp", fp)
        mlflow.log_metric("fn", fn)
        mlflow.log_metric("tp", tp)

        # Save model to MLflow
        mlflow.sklearn.log_model(
            sk_model=best_model,
            artifact_path="models",
            input_example=X_train_reduced[:5]
        )

    print(f"{model_name} - Best F1-Score: {val_f1}")
    print(f"False Positives: {fp}, False Negatives: {fn}")
๐Ÿƒ View run Experiment #5 - LogisticRegression at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1/runs/3c490daa05df433b92534178e0c1464d
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1
LogisticRegression - Best F1-Score: 0.3266132232716882
False Positives: 15152, False Negatives: 1816
๐Ÿƒ View run Experiment #5 - RidgeClassifier at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1/runs/10dbc326c97740779ae912205c2ce8aa
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1
RidgeClassifier - Best F1-Score: 0.32535016768593406
False Positives: 15291, False Negatives: 1808
๐Ÿƒ View run Experiment #5 - RandomForestClassifier at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1/runs/91013cdb28bb4607b2b741557ad37b30
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1
RandomForestClassifier - Best F1-Score: 0.33364063023801543
False Positives: 13952, False Negatives: 1950
๐Ÿƒ View run Experiment #5 - XGBClassifier at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1/runs/9f3cb33daf3b46ce94e9ca49ecb6b9a6
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/1
XGBClassifier - Best F1-Score: 0.22044019791844396
False Positives: 41004, False Negatives: 117
# Visualize the tuned models' F1-scores side by side.
plt.bar(best_f1_scores.keys(), best_f1_scores.values())
plt.ylabel("F1-Score")
plt.title("Experiment #5: F1-Score Comparison")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("experiment5_f1_scores.png")
plt.show()

# NOTE(review): no run is active here, so MLflow opens a fresh auto-named
# run just to hold this artifact.
mlflow.log_artifact("experiment5_f1_scores.png")

png

# Close any run left open by the previous experiment before starting new ones.
mlflow.end_run()
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Split columns by dtype: numeric columns get scaled, object columns one-hot encoded.
numerical_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(include=['object']).columns

numerical_transformer = StandardScaler()
# handle_unknown='ignore' keeps predict() from failing on categories unseen at fit time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Experiment 6: a shallow, class-weighted decision tree on the full pipeline.
mlflow.set_experiment("Experiment 6: Decision Tree Classifier")
dt_params = {
    'max_depth': 5,
    'min_samples_split': 10,
    'class_weight': 'balanced',  # Penalize false negatives
    'random_state': 42
}

dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(**dt_params))
])

with mlflow.start_run(run_name="Decision Tree Classifier"):
    # Fit on the training split, score on the held-out test split.
    dt_pipeline.fit(X_train, y_train)
    y_pred = dt_pipeline.predict(X_test)

    val_f1 = f1_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Log hyper-parameters, then the full confusion-matrix breakdown.
    mlflow.log_param("max_depth", dt_params['max_depth'])
    mlflow.log_param("min_samples_split", dt_params['min_samples_split'])
    for metric_name, metric_value in (
        ("f1_score", val_f1),
        ("precision", tp / (tp + fp)),
        ("recall", tp / (tp + fn)),
        ("tp", tp),
        ("fp", fp),
        ("tn", tn),
        ("fn", fn),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(dt_pipeline, artifact_path="model")

    print(f"Decision Tree - F1-Score: {val_f1}")
    
# Same preprocessing, stronger model: a depth-capped, class-weighted random forest.
rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'class_weight': 'balanced',  # Penalize false negatives
    'random_state': 42
}

rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(**rf_params))
])

mlflow.set_experiment("Updated Random Forest")
with mlflow.start_run(run_name="Random Forest Classifier"):
    rf_pipeline.fit(X_train, y_train)
    y_pred = rf_pipeline.predict(X_test)

    val_f1 = f1_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Hyper-parameters first, then metrics derived from the confusion matrix.
    mlflow.log_param("n_estimators", rf_params['n_estimators'])
    mlflow.log_param("max_depth", rf_params['max_depth'])
    for metric_name, metric_value in (
        ("f1_score", val_f1),
        ("precision", tp / (tp + fp)),
        ("recall", tp / (tp + fn)),
        ("tp", tp),
        ("fp", fp),
        ("tn", tn),
        ("fn", fn),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(rf_pipeline, artifact_path="model")

    print(f"Random Forest - F1-Score: {val_f1}")
    
# Gradient-boosted trees with the positive class re-weighted by the
# negative/positive ratio of the training labels.
class_counts = y_train.value_counts()
xgb_params = {
    'n_estimators': 150,
    'max_depth': 10,
    'learning_rate': 0.1,
    'scale_pos_weight': class_counts[0] / class_counts[1],  # Balance classes
    'random_state': 42
}

xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(**xgb_params))
])

mlflow.set_experiment("Updated XGBoost")
with mlflow.start_run(run_name="XGBoost Classifier"):
    xgb_pipeline.fit(X_train, y_train)
    y_pred = xgb_pipeline.predict(X_test)

    val_f1 = f1_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Hyper-parameters first, then metrics derived from the confusion matrix.
    mlflow.log_param("n_estimators", xgb_params['n_estimators'])
    mlflow.log_param("max_depth", xgb_params['max_depth'])
    mlflow.log_param("learning_rate", xgb_params['learning_rate'])
    for metric_name, metric_value in (
        ("f1_score", val_f1),
        ("precision", tp / (tp + fp)),
        ("recall", tp / (tp + fn)),
        ("tp", tp),
        ("fp", fp),
        ("tn", tn),
        ("fn", fn),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(xgb_pipeline, artifact_path="model")

    print(f"XGBoost - F1-Score: {val_f1}")
Decision Tree - F1-Score: 0.317016317016317
๐Ÿƒ View run Decision Tree Classifier at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/5/runs/f381fdd234554ac68849746addb33e31
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/5
Random Forest - F1-Score: 0.3481619224717494
๐Ÿƒ View run Random Forest Classifier at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/7/runs/00fd0450ec2c42f3ad5580b59d47fc6b
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/7
XGBoost - F1-Score: 0.34516240897006134
๐Ÿƒ View run XGBoost Classifier at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/8/runs/a0dac646db144e41a6b6f6e6e53c2af9
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/8
from sklearn.ensemble import VotingClassifier

# Re-use the hyper-parameters tuned in the individual experiments above.
dt_model = DecisionTreeClassifier(max_depth=5, class_weight='balanced', random_state=42)
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, class_weight='balanced', random_state=42)
xgb_model = XGBClassifier(n_estimators=150, max_depth=10, learning_rate=0.1, random_state=42)

# Soft voting averages the three models' predicted probabilities,
# aiming for a better precision-recall balance than any single model.
base_estimators = [
    ('Decision Tree', dt_model),
    ('Random Forest', rf_model),
    ('XGBoost', xgb_model),
]
voting_model = VotingClassifier(estimators=base_estimators, voting='soft')

ensemble_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('voting', voting_model),
])

# Fit the soft-voting ensemble and log its held-out test performance.
mlflow.set_experiment("Ensemble Model Evaluation")
with mlflow.start_run(run_name="Ensemble Voting Classifier"):
    ensemble_pipeline.fit(X_train, y_train)
    y_pred = ensemble_pipeline.predict(X_test)

    val_f1 = f1_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    mlflow.log_param("voting", "soft")
    for metric_name, metric_value in (
        ("f1_score", val_f1),
        ("precision", tp / (tp + fp)),
        ("recall", tp / (tp + fn)),
        ("tp", tp),
        ("fp", fp),
        ("tn", tn),
        ("fn", fn),
    ):
        mlflow.log_metric(metric_name, metric_value)

    mlflow.sklearn.log_model(ensemble_pipeline, artifact_path="model")
    print(f"Ensemble Voting Classifier - F1-Score: {val_f1}")
Ensemble Voting Classifier - F1-Score: 0.341254513393232
๐Ÿƒ View run Ensemble Voting Classifier at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/9/runs/59d67469973d41d28ecdedfcb07ffcd3
๐Ÿงช View experiment at: https://dagshub.com/Shruti-1205/my-first-repo.mlflow/#/experiments/9
import mlflow
import matplotlib.pyplot as plt

# Point the MLflow client at the remote DagsHub tracking server.
mlflow.set_tracking_uri("https://dagshub.com/Shruti-1205/my-first-repo.mlflow")

# Every experiment logged during this notebook, listed in the order it was run,
# so the summary below can fetch each one's best F1-score.
experiment_names = [
    "Classifier Comparison Experiment",
    "Experiment #3 - Feature Engineering",
    "Experiment #4 - Feature Selection",
    "Experiment #5 - PCA and Hyperparameter Tuning",
    "Experiment 6: Decision Tree Classifier",
    "Updated Random Forest",
    "Updated XGBoost",
    "Ensemble Model Evaluation"
]

# Collect the best (max) F1-score logged under each experiment.
# Fixes two defects in the original loop:
#   1. The loop variable was named `f1_score`, shadowing the sklearn metric
#      function imported earlier in the file — renamed to `best_f1`.
#   2. `if f1_score:` used truthiness, which silently discarded a legitimate
#      best score of 0.0 and accepted NaN (the .max() of an all-NaN column,
#      which is truthy) — replaced with an explicit None/NaN check.
f1_scores = {}
missing_data = []  # experiments with no usable F1 metric

for experiment_name in experiment_names:
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        missing_data.append(experiment_name)
        continue

    runs = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string="attributes.status = 'FINISHED'"
    )
    best_f1 = None
    if not runs.empty and 'metrics.f1_score' in runs:
        best_f1 = runs['metrics.f1_score'].max()

    if best_f1 is not None and pd.notna(best_f1):
        f1_scores[experiment_name] = float(best_f1)
    else:
        missing_data.append(experiment_name)

# Sort experiments by F1-score
sorted_f1_scores = {k: v for k, v in sorted(f1_scores.items(), key=lambda item: item[1], reverse=True)}

# Plot F1-scores
plt.figure(figsize=(12, 6))
plt.bar(sorted_f1_scores.keys(), sorted_f1_scores.values(), color='skyblue')
plt.ylabel("F1-Score")
plt.title("F1-Score Comparison Across Experiments")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.savefig("f1_score_comparison_final.png")
plt.show()

if sorted_f1_scores:
    best_model = max(sorted_f1_scores, key=sorted_f1_scores.get)
    print(f"The best-performing model is '{best_model}' with an F1-Score of {sorted_f1_scores[best_model]:.4f}")
else:
    print("No valid F1-scores found. Please check the experiments.")

png

The best-performing model is 'Updated Random Forest' with an F1-Score of 0.3482
# Rebuild the winning configuration ("Updated Random Forest") end-to-end so
# the exported artifact is self-contained: preprocessing + model in one pipeline.
numerical_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(include=['object']).columns

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'class_weight': 'balanced',  # Penalize false negatives
    'random_state': 42
}
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(**rf_params))
])

mlflow.set_experiment("Updated Random Forest")
with mlflow.start_run(run_name="Random Forest Classifier"):
    rf_pipeline.fit(X_train, y_train)
    y_pred = rf_pipeline.predict(X_test)

    val_f1 = f1_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Hyper-parameters first, then metrics derived from the confusion matrix.
    mlflow.log_param("n_estimators", rf_params['n_estimators'])
    mlflow.log_param("max_depth", rf_params['max_depth'])
    for metric_name, metric_value in (
        ("f1_score", val_f1),
        ("precision", tp / (tp + fp)),
        ("recall", tp / (tp + fn)),
        ("tp", tp),
        ("fp", fp),
        ("tn", tn),
        ("fn", fn),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(rf_pipeline, artifact_path="model")

    print(f"Random Forest - F1-Score: {val_f1}")
Random Forest - F1-Score: 0.34911101577768244
import joblib
import sklearn

# Persist the final pipeline together with the sklearn version it was trained
# under, so a version mismatch can be detected when the artifact is reloaded.
model_metadata = {"scikit_learn_version": sklearn.__version__}
joblib.dump((rf_pipeline, model_metadata), "final_rf_pipeline_with_metadata.joblib")
['final_rf_pipeline_with_metadata.joblib']
# Round-trip check: reload the persisted pipeline and re-score the test set.
loaded_model, metadata = joblib.load("final_rf_pipeline_with_metadata.joblib")
from sklearn.metrics import classification_report, confusion_matrix, f1_score

loaded_predictions = loaded_model.predict(X_test)

val_f1 = f1_score(y_test, loaded_predictions)
tn, fp, fn, tp = confusion_matrix(y_test, loaded_predictions).ravel()

# The reloaded model should reproduce the metrics logged at training time.
print(f"F1-Score: {val_f1}")
print(f"Confusion Matrix:\n TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}")
print("Classification Report:")
print(classification_report(y_test, loaded_predictions))
F1-Score: 0.34911101577768244
Confusion Matrix:
 TN: 33753, FP: 11417, FN: 2238, TP: 3662
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.75      0.83     45170
           1       0.24      0.62      0.35      5900

    accuracy                           0.73     51070
   macro avg       0.59      0.68      0.59     51070
weighted avg       0.86      0.73      0.78     51070
# Smoke test: score one held-out row and compare against its true label.
test_sample = X_test.iloc[0:1]  # slice keeps DataFrame shape (1 row), not a Series
actual_label = y_test.iloc[0]
prediction = loaded_model.predict(test_sample)

print(f"Prediction: {prediction[0]}, Ground Truth: {actual_label}")
Prediction: 0, Ground Truth: 0